* This program uses Perl regular expressions (the SAS PRX functions) to identify observations whose degree title contains a given character string;
* Written by Richard Reeves of the National Student Clearinghouse;
* Modified by Karen Pence March 2007;

/* Listing output: page size 100 and line size 100 */
options ps = 100 ls = 100;

/* IN is the library the cleaned NSC data set is read from and NSC is the
   library the final degree list is written to.  Both point at the same
   directory here. */
libname in "/data";
libname nsc "/data";




        /* Restrict the sample to records that carry a degree title and whose
           COL_year24 value is not L.  Without this reduction the program
           runs out of memory. */
        data degree_only;
          set in.nsc_cleaned_forsas;
          if Degree_Title = " " or COL_year24 = "L" then delete;
        run;

        /* Classify every DEGREE_TITLE into a degree group (DEG_GRP) with Perl
           regular expressions.  The patterns are tested in a fixed order and
           the first match wins, because each test also requires DEG_GRP to
           still be blank.  Titles that match none of the patterns keep a
           blank DEG_GRP. */
        DATA temp;
        SET degree_only;
        LENGTH DEG_GRP $20;

        /* The "/i" option specifies a case-insensitive search.
           Note that "." matches any single character except a new line, so a
           literal period needs a backslash before it. */

        /* What some degrees stand for:
          BSpattern: BM (Bachelor of Music), BME (Biomedical/mechanical engineering),
                     B C E (Bachelor of civil engineering), BGS (Bachelor of General Studies),
                     BESS (Bachelor of Exercise and Sport Science)
           MApattern: M O T (Master of technology; similar to an MBA), M P P (Master of Public Policy), M ED (Master
                     of Education), EDS/Education Specialist/Educational Specialist (a degree in education after the
                     masters-it requires only a year of coursework so I am not counting it as a doctorate), M S
                     Specialist in School Psychology (like Educational Specialist)
          DRpattern:  DPH (Doctor of Public Health)
          AApattern2: ALS (Associate in liberal studies)
          */

        hspattern = "/DIPLOMA|HIGH SCHOOL|GED|MINIMUM HS|RECOMMENDED HS/i";

        /* (?<!B)AA keeps BAAS from resolving as an associate degree */
        AApattern = "/(?<!B)AA|A\.A\.|ASSOC|A\.SCI\.|ASC|A\.S\.|\bAG-|\bAG |A\.A/i";

        /* An examination of the data suggests that a title of the form
           Bachelor of Science/Master of Science should be coded as a master
           degree (look at person 3639).  The negative lookahead after BAC
           keeps such titles out of the bachelor group. */
        /* NOTE(review): the dot in B.ARCH is not escaped, so it matches any
           character in that position.  Left unchanged so that existing
           classifications do not move - confirm whether B\.ARCH was intended. */
        BSpattern = "/BS|B\.S\.|B\. S\. |B\.F\.A\.|BFA|B F A|BAC(?!HELOR OF SCIENCE\/MASTER)|"||
                    "BACELOR|BCH ARTS|BM|BME|B M E|B C E|"||
                    "B MUS|B.ARCH|BGS|BESS|BA IN|ADDITIONAL MAJOR/i";

        LAWpattern = "/JD|J D|J\.D\.|JURIS|LAW|LLM/i";

        MBApattern = "/MBA|M B A|MBA|M\.B\.A\.|M O T|BUS|ACCOUNT|ACCT|MGMT|MANAGEMENT INF|TAXATION/i";

        /* Earlier version of MApattern (without the MED alternative), kept
           for reference in the commented-out block that follows. */
/*
        MApattern = "/MASTER|MS|M\.S\.|M S|M S W|M\.A\.|MPA|M\.P\.A\.|MFA|M\.F\.A\.|M F A|MHA|M\.H\.A\.|M\.A\.T\.|"||
                    "MPH|MPS|M P P|EDS|EDUCATION SPECIALIST|EDUCATIONAL SPECIALIST|M ARCH|SPECIALIST IN|PROFES/i";
*/

        MApattern = "/MASTER|MS|M\.S\.|M S|M S W|M\.A\.|MPA|M\.P\.A\.|MFA|M\.F\.A\.|M F A|MHA|M\.H\.A\.|M\.A\.T\.|"||
                    "MPH|MPS|M P P|\bMED\b|" ||
                    "EDS|EDUCATION SPECIALIST|EDUCATIONAL SPECIALIST|M ARCH|SPECIALIST IN|PROFES/i";

        DRpattern = "/DOCTOR|PHD|PH D|PH\.D\.|MD|M\.D\.|M D|DVM|EDD|D\.V\.M\.|E\.D\.D\.|DPH|DR/i";

        CEpattern = "/CERTIFICATE|CERT|ELECTRICIAN|WELDING|CREDENTIAL|VOCATIONAL|LICENSURE|APPRENTICE/i";

        /* The second-pass patterns below are looser and are only tried after
           all of the first-pass patterns have failed. */
        BSpattern2 = "/B S|BA|B A |B A|BBA |B OF|AB|B\.A\./i";

        AApattern2 = "/ASSOC|AS|ALS|APPLIED AND LIBERAL /i";

        MApattern2 = "/MA|MA-|M ED /i";

        OTpattern = "/PROVISIONAL|HONOR SOCIETY|SPECIALIZATION|NCATE|DISTINGUISHED|OTHER|AWARD/i";

        /* prxparse compiles a degree pattern, prxsubstr reports whether the
           pattern was found, at what position the match starts and how long
           it is (position 0 means no match), and prxfree releases the memory
           held by the compiled pattern again. */

        /* 1. Certificates */
        CEExpID = prxparse(CEpattern);
        CALL prxsubstr(CEExpID, DEGREE_TITLE, CEpos, CElen);
        IF CEpos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "CERTIFICATE";
        END;
        CALL PRXFREE(CEExpID);

        /* 2. High school */
        HSExpID = prxparse(hspattern);
        CALL prxsubstr(HSExpID, DEGREE_TITLE, hspos, hslen);
        IF hspos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "HIGH SCHOOL";
        END;
        CALL PRXFREE(HSExpID);

        /* 3. Associate (first pass) */
        AAExpID = prxparse(AApattern);
        CALL prxsubstr(AAExpID, DEGREE_TITLE, aapos, aalen);
        IF aapos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "ASSOCIATE";
        END;
        CALL PRXFREE(AAExpID);

        /* 4. Bachelor (first pass) */
        BSExpID = prxparse(BSpattern);
        CALL prxsubstr(BSExpID, DEGREE_TITLE, BSpos, BSlen);
        IF BSpos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "BACHELOR";
        END;
        CALL PRXFREE(BSExpID);

        /* 5. Law */
        LAWExpID = prxparse(LAWpattern);
        CALL prxsubstr(LAWExpID, DEGREE_TITLE, LAWpos, LAWlen);
        IF LAWpos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "LAW";
        END;
        CALL PRXFREE(LAWExpID);

        /* 6. MBA and other business titles */
        MBAExpID = prxparse(MBApattern);
        CALL prxsubstr(MBAExpID, DEGREE_TITLE, MBApos, MBAlen);
        IF MBApos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "MBA";
        END;
        /* Free the ID that was actually compiled above.  The call used to
           name a misspelled variable (MBAxpID), so the compiled MBA pattern
           was never released on any observation. */
        CALL PRXFREE(MBAExpID);

        /* 7. Master (first pass) */
        MAExpID = prxparse(MApattern);
        CALL prxsubstr(MAExpID, DEGREE_TITLE, MApos, MAlen);
        IF MApos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "MASTER";
        END;
        CALL PRXFREE(MAExpID);

        /* 8. Doctorate */
        DRExpID = prxparse(DRpattern);
        CALL prxsubstr(DRExpID, DEGREE_TITLE, DRpos, DRlen);
        IF DRpos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "DOCTOR";
        END;
        CALL PRXFREE(DRExpID);

        /* 9. Bachelor (second pass) */
        BS2ExpID = prxparse(BSpattern2);
        CALL prxsubstr(BS2ExpID, DEGREE_TITLE, BS2pos, BS2len);
        IF BS2pos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "BACHELOR";
        END;
        CALL PRXFREE(BS2ExpID);

        /* 10. Associate (second pass) */
        AA2ExpID = prxparse(AApattern2);
        CALL prxsubstr(AA2ExpID, DEGREE_TITLE, AA2pos, AA2len);
        IF AA2pos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "ASSOCIATE";
        END;
        CALL PRXFREE(AA2ExpID);

        /* 11. Master (second pass).
           NOTE(review): this step reuses MApos and MAlen, so the values
           stored by the first master pass are overwritten in the output
           data set.  DEG_GRP is not affected.  Kept as is so the set of
           output variables does not change. */
        MA2ExpID = prxparse(MApattern2);
        CALL prxsubstr(MA2ExpID, DEGREE_TITLE, MApos, MAlen);
        IF MApos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "MASTER";
        END;
        CALL PRXFREE(MA2ExpID);

        /* 12. Other */
        OTExpID = prxparse(OTpattern);
        CALL prxsubstr(OTExpID, DEGREE_TITLE, OTpos, OTlen);
        IF OTpos ^= 0 AND DEG_GRP="" then do;
           DEG_GRP = "OTHER";
        END;
        CALL PRXFREE(OTExpID);
RUN;

/* One-way frequency table of the assigned degree groups */
proc freq data=temp;
  tables deg_grp;
run;

/* Reduce temp to one record per distinct degree title */
proc sort data=temp out=degree_list nodupkey;
  by DEGREE_TITLE;
run;

/* Save only the title and its assigned group to the NSC library */
data nsc.degree_list;
  set degree_list;
  keep deg_grp DEGREE_TITLE;
run;

/* Describe the saved data set */
proc contents data=nsc.degree_list;
run;

/* List the degree titles that fall into each degree group.
   temp is sorted in place first because the BY-group processing in
   proc freq needs the data ordered by deg_grp. */
proc sort data=temp;
  by deg_grp DEGREE_TITLE;
run;

proc freq data=temp;
  by deg_grp;
  tables DEGREE_TITLE / nocum;
run;

/* Keep one record per TUEDT2003 value.
   6,260 persons have valid degree data on the dataset. */
proc sort data=temp out=num_person nodupkey;
  by TUEDT2003;
run;
